set.seed(04232021)

library(caret)
library(e1071)
library(ggplot2)
library(glmnet)
library(glmnetUtils)
if (!exists("expr", inherits=FALSE)) {
  if (file.exists("../data/processed/expr.Rda")) {
    load("../data/processed/expr.Rda")
  } else {
    expr <- read.csv("../data/raw/Brain_GSE50161.csv")
    dir.create("../data/processed", showWarnings=FALSE)
    save(expr, file="../data/processed/expr.Rda")
  }
}
head(expr)
class_histogram <- ggplot(expr, aes(type)) + geom_bar(stat="count") + ggtitle("Distribution of Brain Cancer Types") + xlab("Type") + ylab("Count")
ggsave("../reports/figures/class_histogram.png", class_histogram)
Saving 7 x 5 in image
class_histogram

Looks like the data are imbalanced. The low number of observations for “normal” is concerning.

train_index <- createDataPartition(expr$type, p=0.5, list=FALSE)
X_train <- data.matrix(expr[train_index,-(1:2)])
y_train <- expr[train_index,2]
X_test <- data.matrix(expr[-train_index,-(1:2)])
y_test <- expr[-train_index,2]

Let’s begin with a simple approach, a one-vs-all logistic regression classifier.

types <- unique(expr$type)
models <- lapply(types, function(type) {
  y_train_type <- ifelse(y_train == type, 1, 0)
  model <- cv.glmnet(X_train, y_train_type, family=binomial)
  return(model)
})
ova_y_pred <- as.data.frame(predict(models, X_test, type="response"))
colnames(ova_y_pred) <- types
y_pred <- colnames(ova_y_pred)[max.col(ova_y_pred, "first")]
confusionMatrix(as.factor(y_test), as.factor(y_pred))
Confusion Matrix and Statistics

                       Reference
Prediction              ependymoma glioblastoma medulloblastoma normal pilocytic_astrocytoma
  ependymoma                    23            0               0      0                     0
  glioblastoma                   0           14               0      1                     2
  medulloblastoma                0            0              11      0                     0
  normal                         0            1               0      5                     0
  pilocytic_astrocytoma          0            1               0      0                     6

Overall Statistics
                                         
               Accuracy : 0.9219         
                 95% CI : (0.827, 0.9741)
    No Information Rate : 0.3594         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.8962         
                                         
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                     Class: ependymoma Class: glioblastoma Class: medulloblastoma Class: normal Class: pilocytic_astrocytoma
Sensitivity                     1.0000              0.8750                 1.0000       0.83333                      0.75000
Specificity                     1.0000              0.9375                 1.0000       0.98276                      0.98214
Pos Pred Value                  1.0000              0.8235                 1.0000       0.83333                      0.85714
Neg Pred Value                  1.0000              0.9574                 1.0000       0.98276                      0.96491
Prevalence                      0.3594              0.2500                 0.1719       0.09375                      0.12500
Detection Rate                  0.3594              0.2188                 0.1719       0.07812                      0.09375
Detection Prevalence            0.3594              0.2656                 0.1719       0.09375                      0.10938
Balanced Accuracy               1.0000              0.9062                 1.0000       0.90805                      0.86607

Very good for a first pass!

LS0tCnRpdGxlOiAiRXhwbG9yYXRvcnkgRGF0YSBBbmFseXNpcyIKYXV0aG9yOiAiUGV0ZXIgVHJhbiIKZGF0ZTogIjQvMjMvMjAyMSIKb3V0cHV0OiBodG1sX2RvY3VtZW50Ci0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmtuaXRyOjpvcHRzX2NodW5rJHNldChlY2hvID0gVFJVRSkKYGBgCmBgYHtyfQpzZXQuc2VlZCgwNDIzMjAyMSkKCmxpYnJhcnkoY2FyZXQpCmxpYnJhcnkoZTEwNzEpCmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShnbG1uZXQpCmxpYnJhcnkoZ2xtbmV0VXRpbHMpCmBgYAoKYGBge3J9CmlmICghZXhpc3RzKCJleHByIiwgaW5oZXJpdHM9RkFMU0UpKSB7CiAgaWYgKGZpbGUuZXhpc3RzKCIuLi9kYXRhL3Byb2Nlc3NlZC9leHByLlJkYSIpKSB7CiAgICBsb2FkKCIuLi9kYXRhL3Byb2Nlc3NlZC9leHByLlJkYSIpCiAgfSBlbHNlIHsKICAgIGV4cHIgPC0gcmVhZC5jc3YoIi4uL2RhdGEvcmF3L0JyYWluX0dTRTUwMTYxLmNzdiIpCiAgICBkaXIuY3JlYXRlKCIuLi9kYXRhL3Byb2Nlc3NlZCIsIHNob3dXYXJuaW5ncz1GQUxTRSkKICAgIHNhdmUoZXhwciwgZmlsZT0iLi4vZGF0YS9wcm9jZXNzZWQvZXhwci5SZGEiKQogIH0KfQpgYGAKCmBgYHtyfQpoZWFkKGV4cHIpCmBgYAoKYGBge3J9CmNsYXNzX2hpc3RvZ3JhbSA8LSBnZ3Bsb3QoZXhwciwgYWVzKHR5cGUpKSArIGdlb21fYmFyKHN0YXQ9ImNvdW50IikgKyBnZ3RpdGxlKCJEaXN0cmlidXRpb24gb2YgQnJhaW4gQ2FuY2VyIFR5cGVzIikgKyB4bGFiKCJUeXBlIikgKyB5bGFiKCJDb3VudCIpCmdnc2F2ZSgiLi4vcmVwb3J0cy9maWd1cmVzL2NsYXNzX2hpc3RvZ3JhbS5wbmciLCBjbGFzc19oaXN0b2dyYW0pCmNsYXNzX2hpc3RvZ3JhbQpgYGAKCkxvb2tzIGxpa2UgdGhlIGRhdGEgYXJlIGltYmFsYW5jZWQuIFRoZSBsb3cgbnVtYmVyIG9mIG9ic2VydmF0aW9ucyBmb3IgIm5vcm1hbCIgaXMgY29uY2VybmluZy4KCmBgYHtyfQp0cmFpbl9pbmRleCA8LSBjcmVhdGVEYXRhUGFydGl0aW9uKGV4cHIkdHlwZSwgcD0wLjUsIGxpc3Q9RkFMU0UpClhfdHJhaW4gPC0gZGF0YS5tYXRyaXgoZXhwclt0cmFpbl9pbmRleCwtKDE6MildKQp5X3RyYWluIDwtIGV4cHJbdHJhaW5faW5kZXgsMl0KWF90ZXN0IDwtIGRhdGEubWF0cml4KGV4cHJbLXRyYWluX2luZGV4LC0oMToyKV0pCnlfdGVzdCA8LSBleHByWy10cmFpbl9pbmRleCwyXQpgYGAKCkxldCdzIGJlZ2luIHdpdGggYSBzaW1wbGUgYXBwcm9hY2gsIGEgb25lLXZzLWFsbCBsb2dpc3RpYyByZWdyZXNzaW9uIGNsYXNzaWZpZXIuCgpgYGB7cn0KdHlwZXMgPC0gdW5pcXVlKGV4cHIkdHlwZSkKbW9kZWxzIDwtIGxhcHBseSh0eXBlcywgZnVuY3Rpb24odHlwZSkgewogIHlfdHJhaW5fdHlwZSA8LSBpZmVsc2UoeV90cmFpbiA9PSB0eXBlLCAxLCAwKQogIG1vZGVsIDwtIGN2LmdsbW5ldChYX3RyYWluLCB5X3RyYWluX3R5cGUsIGZhbWlseT1iaW5vbWlhbCkKICByZXR1cm4obW9kZWwpCn0pCmBgYAoKYGBge3J9Cm92YV95X3ByZWQgPC0gYXMuZGF0YS5mcmFtZShwcmVkaWN0KG1vZGVscywgWF90ZXN0LCB0eXBlPSJyZXNwb25zZSIpKQpjb2xuYW1lcyhvdmFfeV9wcmVkKSA8LSB0eXBlcwp5X3ByZWQgPC0gY29sbmFtZXMob3ZhX3lfcHJlZClbbWF4LmNvbChvdmFfeV9wcmVkLCAiZmlyc3QiKV0KY29uZnVzaW9uTWF0cml4KGFzLmZhY3Rvcih5X3Rlc3QpLCBhcy5mYWN0b3IoeV9wcmVkKSkKYGBgCgpWZXJ5IGdvb2QgZm9yIGEgZmlyc3QgcGFzcyE=